require(quanteda)
require(newsmap)
source("functions.R")

# Load the saved tokens object, then remove tokens matching the regular
# expression "\\d" (a digit). `min_nchar = 2` is forwarded to the removal call;
# presumably it limits removal to tokens of two or more characters, leaving
# single-character digits in place -- confirm against the quanteda docs.
toks <- readRDS("data_tokens_ja.RDS")
toks <- tokens_remove(toks, "\\d", valuetype = "regex", min_nchar = 2)

# Build the feature matrix in three steps:
#   1. document-feature matrix from the tokens, without the padding feature;
#   2. trim with a minimum term frequency of 50;
#   3. group with the default grouping (no `groups` argument is supplied;
#      presumably this merges rows sharing a document id -- confirm).
dfmt <- dfm(toks, remove_padding = TRUE)
dfmt <- dfm_trim(dfmt, min_termfreq = 50)
dfmt <- dfm_group(dfmt)

# Look the tokens up in the Japanese newsmap dictionary at level 3 and build
# the label matrix from the resulting dictionary keys. The label matrix is
# grouped the same way as the feature matrix (default grouping).
toks_key <- tokens_lookup(
    toks,
    dictionary = data_dictionary_newsmap_ja,
    levels = 3,
    nested_scope = "dictionary"
)
dfmt_key <- dfm(toks_key)
dfmt_key <- dfm_group(dfmt_key)

# Fit the newsmap model using the feature matrix and the dictionary-derived
# label matrix.
newsmap <- textmodel_newsmap(dfmt, dfmt_key)

# Predict the top class per document together with its confidence value and
# collect both, plus the document ids, in one data frame.
pred <- predict(newsmap, confidence.fit = TRUE)
dat <- data.frame(pred, docid = names(pred$class))

# Documents whose confidence is negative are relabelled as "jp".
# When `class` is a factor, assigning a value that is not one of its levels
# silently yields NA (with a warning), so make sure "jp" is a valid level
# before the assignment.
if (is.factor(dat$class) && !"jp" %in% levels(dat$class)) {
    levels(dat$class) <- c(levels(dat$class), "jp")
}
# which() drops rows whose confidence is NA, so they keep their current class.
dat$class[which(dat$confidence.fit < 0)] <- "jp"
sort(table(dat$class))

# Request predictions for all classes (presumably one row per document and one
# column per class -- confirm) and flag documents in which at most one entry
# is positive.
mat <- predict(newsmap, type = "all")
dat[["alone"]] <- rowSums(mat > 0) < 2

# Persist the prediction table.
saveRDS(dat, file = "data_newsmap_ja.RDS")

